Loading the required data
# Load weekly counts of daily logins (weeks 2-13)
daily.counts <- read.csv("Intermediate_results/regularity_of_study/weekly_counts_of_daily_logins_w2-13.csv")
#colnames(daily.counts)
weekly.counts <- daily.counts %>%
  select(user_id, W2_cnt:W13_cnt, tot_cnt, weekly_entropy)
# str(weekly.counts)
daily.gaps <- read.csv("Intermediate_results/regularity_of_study/gaps_between_consecutive_logins_w2-13.csv")
# str(daily.gaps)
# daily gaps do not have a normal distribution, so the median will be used
# merge weekly counts and median time gap
counts.data <- merge(x = weekly.counts, y = daily.gaps %>% select(user_id, median_gap),
                     by = 'user_id', all = TRUE)
exam.scores <- read.csv(file = "Intermediate_results/exam_scores_with_student_ids.csv")
# remove email data (2nd column)
exam.scores <- exam.scores %>% select(-2)
# str(exam.scores)
# merge counts data with exam scores; keep all students with login data (all.x),
# drop exam records without matching login data
counts.data <- merge(x = counts.data, y = exam.scores, by.x = 'user_id', by.y = 'USER_ID',
                     all.x = TRUE, all.y = FALSE)
#summary(counts.data)
# 9 NA values for exam scores; remove them
counts.data <- counts.data %>% filter( !is.na(SC_FE_TOT) )
This means that predictors are counts of active days (days when a student had at least one learning session) per week, entropy of weekly active days, and median gap between two consecutive active days.
# Model 1: regress the final exam score on the weekly active-day counts,
# weekly entropy, and median gap (drop the id, total count, and midterm score)
lm1.data <- counts.data %>%
  select(-tot_cnt, -user_id, -SC_MT_TOT)
lm1 <- lm(SC_FE_TOT ~ ., data = lm1.data)
summary(lm1)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm1.data)
Residuals:
Min 1Q Median 3Q Max
-23.2397 -6.3099 -0.6918 5.9104 19.9677
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.2257 2.3354 3.522 0.00047 ***
W2_cnt 0.4985 0.3431 1.453 0.14685
W3_cnt 0.0926 0.3651 0.254 0.79991
W4_cnt 0.1341 0.3532 0.380 0.70441
W5_cnt -0.3261 0.3228 -1.010 0.31290
W6_cnt -0.1632 0.3676 -0.444 0.65736
W7_cnt 0.5343 0.4188 1.276 0.20266
W8_cnt 1.0919 0.4068 2.685 0.00752 **
W9_cnt 0.1106 0.3862 0.286 0.77475
W10_cnt 1.0810 0.3454 3.130 0.00186 **
W11_cnt 0.1618 0.3891 0.416 0.67768
W12_cnt 0.4559 0.3298 1.383 0.16746
W13_cnt 0.8561 0.2901 2.951 0.00333 **
weekly_entropy 11.6096 9.9681 1.165 0.24475
median_gap -0.2072 0.3777 -0.549 0.58350
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.417 on 462 degrees of freedom
Multiple R-squared: 0.2831, Adjusted R-squared: 0.2614
F-statistic: 13.03 on 14 and 462 DF, p-value: < 2.2e-16
It’s interesting that counts for only 3 weeks (W8, W10, and W13) are significant, and that all three weeks are in the 2nd part of the course (after the midterm exam):
R-squared is 0.283 (adjusted R2: 0.261).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm1$residuals)
# OK
# assumption 2: homoscedasticity of residuals (equal variance)
# assumption 3: normality of residuals
par(mfrow = c(2, 2))
plot(lm1)
par(mfrow = c(1, 1)) # change back to 1 x 1
# there are a few potential influential points: 80, 50, 459
## assumption 4: predictors and residuals are uncorrelated
# iterate over predictor columns only (the last column is the outcome, SC_FE_TOT);
# avoid `c` as a loop variable, as it masks base::c()
for (j in seq_len(ncol(lm1.data) - 1))
  print(cor.test(lm1.data[, j], lm1$residuals))
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm1)
# OK, values below or equal to 2
The assumptions are satisfied, though there are a few potentially influential points that might need to be considered if this model is to be used.
# Model 2: aggregate engagement (total count of active days) instead of
# the individual weekly counts
lm2.data <- counts.data %>%
  select(tot_cnt, median_gap, weekly_entropy, SC_FE_TOT)
lm2 <- lm(SC_FE_TOT ~ ., data = lm2.data)
summary(lm2)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm2.data)
Residuals:
Min 1Q Median 3Q Max
-25.649 -6.003 -0.660 5.943 20.407
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.84084 2.17939 2.680 0.00762 **
tot_cnt 0.40120 0.04530 8.857 < 2e-16 ***
median_gap -0.05991 0.37409 -0.160 0.87284
weekly_entropy 11.87966 9.90829 1.199 0.23114
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.476 on 473 degrees of freedom
Multiple R-squared: 0.2557, Adjusted R-squared: 0.251
F-statistic: 54.17 on 3 and 473 DF, p-value: < 2.2e-16
The total number of active days is the only significant predictor, and it is highly significant. Each additional active day contributes 0.4 points to the final exam score.
R-squared is 0.2557 (adjusted R2: 0.251).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm2$residuals)
# OK
# assumption 2: homoscedasticity of residuals (equal variance)
# assumption 3: normality of residuals
par(mfrow = c(2, 2))
plot(lm2)
par(mfrow = c(1, 1)) # change back to 1 x 1
# both OK
## assumption 4: predictors and residuals are uncorrelated
# iterate over predictor columns only (the last column is the outcome, SC_FE_TOT)
for (j in seq_len(ncol(lm2.data) - 1))
  print(cor.test(lm2.data[, j], lm2$residuals))
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm2)
# OK, values below or equal to 2
All assumptions are satisfied.
Loading the required data
# Model 3: weekly session counts + weekly entropy + median inter-session gap
weekly.sessions <- read.csv("Intermediate_results/regularity_of_study/weekly_session_props.csv")
#str(weekly.sessions)
ses.gap.data <- read.csv("Intermediate_results/regularity_of_study/inter-session_time_intervals.csv")
#str(ses.gap.data)
lm3.data <- merge(x = weekly.sessions %>% select(count_w2:count_w12, weekly_entropy, user_id),
                  y = ses.gap.data %>% select(user_id, median_s_gap),
                  by = 'user_id', all = TRUE)
lm3.data <- merge(x = lm3.data, y = exam.scores %>% select(USER_ID, SC_FE_TOT),
                  by.x = 'user_id', by.y = 'USER_ID', all.x = TRUE, all.y = FALSE)
# summary(lm3.data)
# keep only students with a final exam score; drop the id before modeling
lm3.data <- lm3.data %>% filter(!is.na(SC_FE_TOT)) %>% select(-user_id)
lm3 <- lm(SC_FE_TOT ~ ., data = lm3.data)
summary(lm3)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm3.data)
Residuals:
Min 1Q Median 3Q Max
-21.6292 -5.9597 -0.8594 5.9990 21.2523
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 15.2754010 1.6511046 9.252 < 2e-16 ***
count_w2 0.0053628 0.1110213 0.048 0.96149
count_w3 0.1909662 0.1157368 1.650 0.09962 .
count_w4 -0.0832153 0.1005015 -0.828 0.40810
count_w5 -0.2802013 0.0866320 -3.234 0.00131 **
count_w7 0.2530987 0.1498286 1.689 0.09184 .
count_w8 0.1688238 0.1441518 1.171 0.24214
count_w9 0.1951703 0.1603609 1.217 0.22420
count_w10 0.5158746 0.1080020 4.777 2.4e-06 ***
count_w11 0.3756283 0.1545471 2.431 0.01546 *
count_w12 0.1057666 0.1333451 0.793 0.42808
weekly_entropy 25.4435546 8.4577490 3.008 0.00277 **
median_s_gap -0.0001634 0.0003508 -0.466 0.64167
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.31 on 463 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.2938, Adjusted R-squared: 0.2755
F-statistic: 16.05 on 12 and 463 DF, p-value: < 2.2e-16
Significant predictors: count_w5 (negative), count_w10, count_w11, and weekly_entropy.
R-squared: 0.294 (adjusted R2: 0.275).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm3$residuals)
# OK
# assumption 2: homoscedasticity of residuals (equal variance)
# assumption 3: normality of residuals
par(mfrow = c(2, 2))
plot(lm3)
par(mfrow = c(1, 1)) # change back to 1 x 1
# mostly fine, but there are a few (potentially) influential points: 412, 459, 437, 77
# let's examine them
lm3.data[c(412, 459, 437, 77), ]
summary(lm3.data)
# 437 has very low engagement and very high exam score (35)
# 459 has high engagement (at times very high) and zero (0) exam score
# 412 is similar to 459, but not that extreme (7 exam score; less active)
# 77 is almost completely inactive, and has zero (0) exam score
## assumption 4: predictors and residuals are uncorrelated
# lm() dropped the one row with a missing median_s_gap ("1 observation deleted
# due to missingness"), so drop it here too to align lm3.data with lm3$residuals
lm3.data <- lm3.data %>% filter(!is.na(median_s_gap))
# iterate over predictor columns only (the last column is the outcome, SC_FE_TOT)
for (j in seq_len(ncol(lm3.data) - 1))
  print(cor.test(lm3.data[, j], lm3$residuals))
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm3)
# OK, values below or slightly above 2
# Model 4: total number of sessions + weekly entropy + median inter-session gap
lm4.data <- merge(x = weekly.sessions %>% select(user_id, s_total, weekly_entropy),
                  y = ses.gap.data %>% select(-mad_s_gap),
                  by = 'user_id', all = TRUE)
lm4.data <- merge(x = lm4.data, y = exam.scores %>% select(USER_ID, SC_FE_TOT),
                  by.x = 'user_id', by.y = 'USER_ID', all.x = TRUE, all.y = FALSE)
# keep only students with a final exam score; drop the id before modeling
lm4.data <- lm4.data %>% filter( !is.na(SC_FE_TOT) ) %>% select(-user_id)
lm4 <- lm(SC_FE_TOT ~ ., data = lm4.data)
summary(lm4)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm4.data)
Residuals:
Min 1Q Median 3Q Max
-29.4102 -6.3591 -0.7392 6.5142 19.4481
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.526e+01 1.658e+00 9.205 < 2e-16 ***
s_total 1.235e-01 1.383e-02 8.926 < 2e-16 ***
weekly_entropy 3.188e+01 8.598e+00 3.708 0.000233 ***
median_s_gap 2.975e-05 3.607e-04 0.082 0.934310
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.599 on 472 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.2292, Adjusted R-squared: 0.2243
F-statistic: 46.78 on 3 and 472 DF, p-value: < 2.2e-16
Significant predictors: s_total and weekly_entropy.
R-squared: 0.229 (adjusted R2: 0.224).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm4$residuals)
# OK
# assumption 2: homoscedasticity of residuals (equal variance)
# assumption 3: normality of residuals
par(mfrow = c(2, 2))
plot(lm4)
par(mfrow = c(1, 1))
# the Residuals vs Fitted plot suggests that there might be some non-linear
# relationship between the outcome and the predictors
# there are also a few influential points: 412, 459, 376, 77
# let's examine them
lm4.data[c(412, 459, 376, 77), ]
summary(lm4.data)
# 77 is a clear outlier
# 376 has relatively high engagement (above 3rd quartile), but very low exam score (4)
# 412 and 459 have already been examined before
## assumption 4: predictors and residuals are uncorrelated
# lm() dropped the one row with a missing median_s_gap, so drop it here too
# to align lm4.data rows with lm4$residuals
lm4.data <- lm4.data %>% filter(!is.na(median_s_gap))
# iterate over predictor columns only (the last column is the outcome, SC_FE_TOT)
for (j in seq_len(ncol(lm4.data) - 1))
  print(cor.test(lm4.data[, j], lm4$residuals))
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm4)
# OK, values below 2
Loading the data
# Model 5: per-weekday session counts + weekday entropy
weekday.sessions <- read.csv("Intermediate_results/regularity_of_study/weekday_session_props.csv")
#str(weekday.sessions)
# NOTE(review): positional selection — columns 1:8 and 11 are assumed to be
# user_id, the seven weekday counts, and weekday_entropy; confirm with
# str(weekday.sessions) if the CSV layout ever changes
lm5.data <- merge(x = weekday.sessions %>% select(1:8, 11),
                  y = exam.scores %>% select(-SC_MT_TOT),
                  by.x = "user_id", by.y = "USER_ID",
                  all.x = TRUE, all.y = FALSE)
# summary(lm5.data)
# keep only students with a final exam score; drop the id before modeling
lm5.data <- lm5.data %>% filter( !is.na(SC_FE_TOT) ) %>% select(-user_id)
lm5 <- lm(SC_FE_TOT ~ ., data = lm5.data)
summary(lm5)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm5.data)
Residuals:
Min 1Q Median 3Q Max
-31.2357 -6.1439 -0.9892 6.7715 20.2779
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 15.07048 2.26948 6.641 8.69e-11 ***
Sun_count 0.08458 0.09785 0.864 0.387818
Mon_count 0.20397 0.04877 4.182 3.44e-05 ***
Tue_count 0.11681 0.03721 3.140 0.001799 **
Wed_count 0.10067 0.04187 2.405 0.016578 *
Thu_count 0.16141 0.04133 3.905 0.000108 ***
Fri_count 0.12135 0.10226 1.187 0.235970
Sat_count -0.02262 0.12568 -0.180 0.857269
weekday_entropy 17.61256 6.86108 2.567 0.010567 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.663 on 468 degrees of freedom
Multiple R-squared: 0.2307, Adjusted R-squared: 0.2176
F-statistic: 17.55 on 8 and 468 DF, p-value: < 2.2e-16
Significant predictors: Mon_count, Tue_count, Wed_count, Thu_count, and weekday_entropy.
R-squared: 0.231 (adjusted R2: 0.218).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm5$residuals)
# OK
# assumption 2: homoscedasticity of residuals (equal variance)
# assumption 3: normality of residuals
par(mfrow = c(2, 2))
plot(lm5)
par(mfrow = c(1, 1)) # change back to 1 x 1
# mostly fine, but there are a few potentially influential points:
# the usual suspects (412, 459), and 230
# let's examine them
lm5.data[c(412, 459, 230), ]
summary(lm5.data)
# 230 has low engagement and a very high exam score (35)
# 459 and 412 have already been considered
## assumption 4: predictors and residuals are uncorrelated
# iterate over predictor columns only (the last column is the outcome, SC_FE_TOT)
for (j in seq_len(ncol(lm5.data) - 1))
  print(cor.test(lm5.data[, j], lm5$residuals))
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm5)
# OK, values below 2
As predictors, use total counts of different kinds of resources students used during their active days (an active day is a day when a student had at least one study session). The types of resources considered:
In addition, consider using:
Loading the data…
# Model 6.1: total counts and proportions of different resource types used
res.use.stats <- read.csv("Intermediate_results/regularity_of_study/daily_resource_use_statistics_w2-5_7-12.csv")
#str(res.use.stats)
lm6.data <- merge(res.use.stats, exam.scores, by.x = "user_id", by.y = "USER_ID",
                  all.x = TRUE, all.y = FALSE)
lm6.data <- lm6.data %>% select(-c(user_id, SC_MT_TOT)) %>% filter( !is.na(SC_FE_TOT) )
lm6_1.data <- lm6.data %>% select( starts_with("tot"), starts_with("prop"), SC_FE_TOT)
# examine the presence of (high) correlation between the variables
ggcorr(lm6_1.data, method = c("complete", "spearman"),
       # geom = "circle", min_size = 0, max_size = 15,
       label = TRUE, label_size = 3.5,
       hjust = 0.85, size = 4, layout.exp = 1)
# tot_mcog_cnt and prop_mcog_used are highly correlated, as are tot_video_cnt
# and prop_video_used, and tot_mcq_cnt and prop_mcq_used
lm6_1.data <- lm6_1.data %>% select(-c(prop_mcog_used, prop_video_used, prop_mcq_used))
# remove the outliers and re-run the model
# (row positions identified from the diagnostic plots of the initial fit)
lm6_1.data <- lm6_1.data[-c(86, 412, 462, 459), ]
lm6_1 <- lm(SC_FE_TOT ~ ., data = lm6_1.data)
summary(lm6_1)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm6_1.data)
Residuals:
Min 1Q Median 3Q Max
-22.0996 -5.9578 -0.1087 6.5627 20.6982
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.505e+01 2.572e+01 2.529 0.0118 *
tot_video_cnt 1.314e-03 8.603e-04 1.528 0.1273
tot_exe_cnt -6.820e-03 1.328e-03 -5.135 4.16e-07 ***
tot_mcq_cnt 8.755e-03 3.661e-03 2.392 0.0172 *
tot_mcog_cnt 9.811e-03 1.281e-02 0.766 0.4441
tot_res_cnt 1.008e-02 1.955e-03 5.156 3.74e-07 ***
prop_exe_used 4.138e-01 3.661e+00 0.113 0.9100
prop_res_used -4.626e+01 2.576e+01 -1.796 0.0732 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.798 on 465 degrees of freedom
Multiple R-squared: 0.1922, Adjusted R-squared: 0.1801
F-statistic: 15.81 on 7 and 465 DF, p-value: < 2.2e-16
Significant predictors: tot_exe_cnt (negative), tot_res_cnt, and tot_mcq_cnt.
R-squared: 0.192 (adjusted R2: 0.180).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm6_1$residuals)
# OK
# assumption 2: homoscedasticity of residuals (equal variance)
# assumption 3: normality of residuals
par(mfrow = c(2, 2))
plot(lm6_1)
par(mfrow = c(1, 1))
# unclear if the homoscedasticity requirement is fulfilled; check using this plot:
plot(fitted(lm6_1), resid(lm6_1, type = "pearson"), col = "blue",
     xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, lwd = 2)
lines(smooth.spline(fitted(lm6_1), residuals(lm6_1)), lwd = 2, col = 'red')
# not that good
# # the plots point to a couple of outliers: 86, 412, 462, 459
# # let's check them:
# lm6_1.data[c(86, 412, 462, 459),]
# summary(lm6_1.data)
# # 459 and 462 have zero exam score, in spite of a non-negligible number of learning events (especially 459)
# # 412 was highly active, but had a very low exam score (7)
## assumption 4: predictors and residuals are uncorrelated
# iterate over predictor columns only (the last column is the outcome, SC_FE_TOT)
for (j in seq_len(ncol(lm6_1.data) - 1))
  print(cor.test(lm6_1.data[, j], lm6_1$residuals))
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm6_1)
# it's fine: all below or equal to 2
The assumption of homoscedasticity cannot be considered satisfied (even after removing outliers)
# Model 6.2: the engagement indicators that proved at least slightly relevant in
# the previous model, plus the mad_X_cnt variables as indicators of (ir)regularity
lm6_2.data <- lm6.data %>%
  select(tot_mcq_cnt, tot_exe_cnt, tot_res_cnt, prop_res_used,
         starts_with("mad"), SC_FE_TOT)
# check for (high) correlations among the candidate predictors
ggcorr(lm6_2.data, method = c("complete", "spearman"),
       # geom = "circle", min_size = 0, max_size = 15,
       label = TRUE, label_size = 3.5,
       hjust = 0.85, size = 4, layout.exp = 1)
# mad_res_cnt is highly correlated with tot_res_cnt (which proved significant),
# so it is excluded
lm6_2.data <- lm6_2.data %>% select(-mad_res_cnt)
lm6_2 <- lm(SC_FE_TOT ~ ., data = lm6_2.data)
summary(lm6_2)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm6_2.data)
Residuals:
Min 1Q Median 3Q Max
-22.9335 -6.1144 -0.1174 6.8471 23.1086
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 66.127636 26.170686 2.527 0.011840 *
tot_mcq_cnt 0.012136 0.003640 3.334 0.000923 ***
tot_exe_cnt -0.006543 0.001360 -4.813 2.01e-06 ***
tot_res_cnt 0.010416 0.002000 5.208 2.87e-07 ***
prop_res_used -47.054649 26.296683 -1.789 0.074201 .
mad_video_cnt -0.007622 0.081861 -0.093 0.925861
mad_exe_cnt -0.003117 0.020326 -0.153 0.878197
mad_mcq_cnt -0.543694 0.379258 -1.434 0.152362
mad_mcog_cnt -0.952299 1.089663 -0.874 0.382599
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.983 on 468 degrees of freedom
Multiple R-squared: 0.1728, Adjusted R-squared: 0.1586
F-statistic: 12.22 on 8 and 468 DF, p-value: 6.383e-16
None of the MAD variables is significant
As predictors, use total number of learning actions (during active days) with a particular topic focus; possible topic foci:
In addition, consider including the following basic statistics:
Loading the required data…
# Model 7.1: total counts and proportions of learning actions per topic focus
topic.stats <- read.csv("Intermediate_results/regularity_of_study/topic_counts_statistics_w2-5_7-12.csv")
# str(topic.stats)
lm7.data <- merge(topic.stats, exam.scores, by.x = "user_id", by.y = "USER_ID",
                  all.x = TRUE, all.y = FALSE)
lm7.data <- lm7.data %>% select(-c(user_id, SC_MT_TOT)) %>% filter( !is.na(SC_FE_TOT) )
lm7_1.data <- lm7.data %>% select( starts_with("tot"), ends_with("prop"), SC_FE_TOT)
summary(lm7_1.data)
# examine the presence of (high) correlation between the variables
ggcorr(lm7_1.data, method = c("complete", "spearman"),
       # geom = "circle", min_size = 0, max_size = 15,
       label = TRUE, label_size = 3.5,
       hjust = 0.85, size = 4, layout.exp = 1)
# exclude tot_orient_cnt and orient_prop as they are highly correlated with some other variables
lm7_1.data <- lm7_1.data %>% select(-c(tot_orient_cnt, orient_prop))
# exclude tot_prj_cnt, due to high VIF
lm7_1.data <- lm7_1.data %>% select(-tot_prj_cnt)
lm7_1 <- lm(SC_FE_TOT ~ ., data = lm7_1.data)
summary(lm7_1)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm7_1.data)
Residuals:
Min 1Q Median 3Q Max
-22.4947 -6.5546 -0.6803 6.5096 21.4971
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.592e+01 1.040e+01 4.413 1.27e-05 ***
tot_ontopic_cnt 4.854e-03 9.985e-04 4.861 1.60e-06 ***
tot_revisit_cnt -5.237e-03 1.455e-03 -3.600 0.000352 ***
tot_metacog_cnt 1.191e-02 3.565e-03 3.341 0.000901 ***
ontopic_prop 8.082e-01 3.322e+00 0.243 0.807890
revisit_prop 1.940e+00 3.106e+00 0.625 0.532537
metacog_prop -3.469e+01 1.017e+01 -3.410 0.000705 ***
prj_prop 4.500e+00 4.893e+00 0.920 0.358177
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.063 on 469 degrees of freedom
Multiple R-squared: 0.1562, Adjusted R-squared: 0.1436
F-statistic: 12.4 on 7 and 469 DF, p-value: 1.347e-14
Significant predictors: tot_ontopic_cnt, tot_revisit_cnt (negative), tot_metacog_cnt, and metacog_prop (negative).
R-squared: 0.156 (adjusted R2: 0.145).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm7_1$residuals)
# OK
# assumption 2: homoscedasticity of residuals (equal variance)
# assumption 3: normality of residuals
par(mfrow = c(2, 2))
plot(lm7_1)
par(mfrow = c(1, 1))
# normality is fine
# unclear if the homoscedasticity requirement is fulfilled; check using this plot:
plot(fitted(lm7_1), resid(lm7_1, type = "pearson"), col = "blue",
     xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, lwd = 2)
lines(smooth.spline(fitted(lm7_1), residuals(lm7_1)), lwd = 2, col = 'red')
# it's fine
## assumption 4: predictors and residuals are uncorrelated
# BUG FIX: the loop previously ran over 1:8, which included the OUTCOME column
# (SC_FE_TOT is the 8th column of lm7_1.data); the outcome is correlated with
# the residuals by construction, so only the 7 predictor columns are tested
for (j in seq_len(ncol(lm7_1.data) - 1))
  print(cor.test(lm7_1.data[, j], lm7_1$residuals))
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm7_1)
# now, it's fine
# Model 7.2: the engagement indicators that proved at least slightly relevant in
# the previous model, plus the mad_X_cnt variables as indicators of (ir)regularity
lm7_2.data <- lm7.data %>%
  select(tot_ontopic_cnt, tot_revisit_cnt, tot_metacog_cnt, metacog_prop,
         starts_with("mad"), SC_FE_TOT)
# check for (high) correlations among the candidate predictors
ggcorr(lm7_2.data, method = c("complete", "spearman"),
       # geom = "circle", min_size = 0, max_size = 15,
       label = TRUE, label_size = 3.5,
       hjust = 0.85, size = 4, layout.exp = 1)
# tot_metacog_cnt is highly correlated with mad_metacog_cnt and mad_orient_cnt;
# drop it together with mad_orient_cnt
lm7_2.data <- lm7_2.data %>% select(-c(tot_metacog_cnt, mad_orient_cnt))
lm7_2 <- lm(SC_FE_TOT ~ ., data = lm7_2.data)
summary(lm7_2)
Call:
lm(formula = SC_FE_TOT ~ ., data = lm7_2.data)
Residuals:
Min 1Q Median 3Q Max
-20.586 -6.413 -1.047 6.841 22.230
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.254e+01 9.826e+00 4.330 1.83e-05 ***
tot_ontopic_cnt 7.177e-03 9.323e-04 7.698 8.27e-14 ***
tot_revisit_cnt -3.802e-03 1.352e-03 -2.812 0.00513 **
metacog_prop -2.679e+01 1.019e+01 -2.629 0.00885 **
mad_ontopic_cnt -1.257e-01 4.010e-02 -3.135 0.00182 **
mad_revisit_cnt -9.064e-02 7.364e-02 -1.231 0.21897
mad_metacog_cnt -3.913e-02 1.476e-01 -0.265 0.79102
mad_prj_cnt -7.659e-01 4.124e+00 -0.186 0.85276
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.1 on 469 degrees of freedom
Multiple R-squared: 0.1493, Adjusted R-squared: 0.1366
F-statistic: 11.76 on 7 and 469 DF, p-value: 8.013e-14
The only regularity indicator that proved significant: mad_ontopic_cnt - one unit increase in MAD of ontopic counts leads to a decrease of 0.126 points in the final exam score
R2: 0.1493 (adjusted R2: 0.1366).
Checking if the model satisfies the assumptions for linear regression:
# assumption 1: the mean of residuals is zero
mean(lm7_2$residuals)
# OK
# assumption 2: homoscedasticity of residuals (equal variance)
# assumption 3: normality of residuals
par(mfrow = c(2, 2))
plot(lm7_2)
par(mfrow = c(1, 1))
# normality is fine
# unclear if the homoscedasticity requirement is fulfilled; check using this plot:
plot(fitted(lm7_2), resid(lm7_2, type = "pearson"), col = "blue",
     xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, lwd = 2)
lines(smooth.spline(fitted(lm7_2), residuals(lm7_2)), lwd = 2, col = 'red')
# not bad
# a few influential points: 60, 54, 202
# and a few outliers: 19, 294, 50
## assumption 4: predictors and residuals are uncorrelated
# iterate over predictor columns only (the last column is the outcome, SC_FE_TOT);
# avoid `c` as a loop variable, as it masks base::c()
for (j in seq_len(ncol(lm7_2.data) - 1))
  print(cor.test(lm7_2.data[, j], lm7_2$residuals))
# OK
## assumption 6: no multicollinearity between explanatory variables
vif(lm7_2)
# OK
A few outliers and (potentially) influential points; apart from that, it’s fine